# -*- coding: UTF-8 -*-
import requests
import sys
import pymongo

# Reload the sys module and then set the interpreter-wide default string
# encoding to UTF-8. The bare `reload` builtin and `sys.setdefaultencoding`
# are Python 2 idioms; this script will not run unchanged on Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')

from bs4 import BeautifulSoup

# Open a client to the MongoDB server, select the `lianjia` database,
# authenticate against it, and keep a handle to the `BjDeal` collection.
# NOTE(review): the server address and the credentials are hard-coded in
# source; consider loading them from configuration or the environment.
# NOTE(review): `Database.authenticate` is not available in every PyMongo
# release - confirm the installed version before upgrading the driver.
connection = pymongo.MongoClient('139.224.135.121', 27017)
db = connection.lianjia
db.authenticate("developer","qwerqwer")
BjDeal = db.BjDeal

def _store_list_items(section, record):
    """Copy every <li> found under *section* into *record*.

    The text of the item's first <span> is used as the key. The value is the
    whole item text (the label text included), with surrounding double quotes
    stripped first and surrounding whitespace stripped afterwards.
    """
    for li in section.find_all('li'):
        record[li.find('span').text] = li.text.strip('\"').strip()


# Walk the paginated deal listing until a page has no result list, visit the
# detail page of every listing and insert one document per deal into BjDeal.
i = 1  # listing page number currently being fetched
j = 1  # ordinal of the next record to be inserted (used for progress output)
while True:
    r = requests.get('https://bj.lianjia.com/chengjiao/d' + str(i))
    soup = BeautifulSoup(r.text, 'lxml')
    ul = soup.find('ul', attrs={'class': 'listContent'})
    if ul is None:
        # No result list on this page: stop crawling.
        break
    agroup = ul.find_all('a', attrs={'class': 'img'})
    for a in agroup:
        h = requests.get(a['href'])
        smallsoup = BeautifulSoup(h.text, 'lxml')
        # Dictionary that collects the fields of this deal.
        msg = {}
        # Any element missing from the detail page raises inside this block;
        # the listing is then skipped instead of stopping the whole crawl.
        try:
            title = smallsoup.find('title')
            msg['title'] = title.text
            total = smallsoup.find('span', attrs={'class': 'dealTotalPrice'})
            total = total.find('i')
            msg['total'] = total.text

            # NOTE(review): this value is read but never stored in msg. The
            # lookup is kept because a page lacking it has always been
            # skipped; confirm whether it was meant to be saved.
            totalAtFirst = smallsoup.find('div', attrs={'class': 'msg'})
            totalAtFirst = totalAtFirst.find('span').find('label')
            totalAtFirst = totalAtFirst.text

            price = smallsoup.find('div', attrs={'class': 'price'})
            price = price.find('b')
            msg['price'] = price.text
            info = smallsoup.find('div', attrs={'class': 'newwrap baseinform'})

            base = info.find('div', attrs={'class': 'base'})
            _store_list_items(base, msg)

            # Previously the `base` items were iterated a second time here,
            # so the transaction attributes were never saved.
            transaction = info.find('div', attrs={'class': 'transaction'})
            if transaction is not None:
                # Pages without this section were stored before the fix and
                # still are; they simply carry no transaction attributes.
                _store_list_items(transaction, msg)

            msg['transtime'] = smallsoup.find(
                'p', attrs={'class': 'record_detail'}).text[-10:-1]
            BjDeal.insert(msg)
            print("第"+str(j)+"条数据已插入数据库")
            j = j + 1
        except Exception as e:
            # Catch Exception rather than everything so Ctrl-C and
            # SystemExit can still stop the crawler; report the cause.
            print('出了点小问题')
            print(repr(e))
    i = i + 1
